Simulation of race results

library(dplyr)
library(lubridate)
library(plotly)

Reading and transforming the data of the 1500m finalists at the Paris 2024 Olympic Games. In addition, we keep only the data from the finals. In this way we ‘better’ simulate the race we are interested in. The performance of a runner in ‘non-finals’ does not have to match his performance in finals.

  JJOO_finalist_data_t_12 <- read.csv("./JJOO_finalist_data_t_12.csv")
sim1 = JJOO_finalist_data_t_12
sim1 = sim1 %>% 
           mutate(mark_s = minute(ms(sim1$mark)) + second(ms(sim1$mark))/60)
sim_f = sim1[sim1$race=="F",]
head(sim_f)
##     X race              competition                     date disciplineCode
## 3   3    F U.S. Olympic Team Trials 2024-06-24T00:00:00.000Z           1500
## 6   6    F The XXXIII Olympic Games 2024-08-06T00:00:00.000Z           1500
## 7   7    F    Athletissima Lausanne 2024-08-22T00:00:00.000Z           1500
## 8   8    F        Weltklasse Zürich 2024-09-05T00:00:00.000Z           1500
## 9   9    F       Memorial van Damme 2024-09-13T00:00:00.000Z           1500
## 10 10    F USA Indoor Championships 2024-02-17T00:00:00.000Z           1500
##       mark        Name   mark_s
## 3  3:30.59 Cole Hocker 3.509833
## 6  3:27.65 Cole Hocker 3.460833
## 7  3:29.85 Cole Hocker 3.497500
## 8  3:30.46 Cole Hocker 3.507667
## 9  3:30.94 Cole Hocker 3.515667
## 10 3:37.51 Cole Hocker 3.625167

The mean and sd of the times for the athletes involved are shown.

sim2 =sim_f %>%
              group_by(Name) %>%
              summarise(mean = mean(mark_s),sd=sqrt(var(mark_s)),min=min(mark_s))
sim2
## # A tibble: 12 × 4
##    Name                   mean     sd   min
##    <chr>                 <dbl>  <dbl> <dbl>
##  1 "Brian Komen"          3.60 0.0718  3.48
##  2 "Cole Hocker"          3.53 0.0536  3.46
##  3 "Hobbs Kessler"        3.56 0.0535  3.49
##  4 "Jakob Ingebrigtsen"   3.50 0.0366  3.45
##  5 "Josh Kerr"            3.50 0.0263  3.46
##  6 "NORDAS Narve Gilje "  3.56 0.0434  3.49
##  7 "Neil Gourley"         3.55 0.0504  3.51
##  8 "Niels Laros"          3.52 0.0156  3.49
##  9 "Pietro Arese"         3.59 0.0582  3.51
## 10 "Stefan Nillessen"     3.60 0.0801  3.51
## 11 "Timothy Cheruiyot"    3.55 0.0619  3.48
## 12 "Yared Nuguse"         3.50 0.0241  3.46

And drawing its density with the observed data.

p=ggplot(sim_f,aes(mark_s,col=Name))+geom_density()
library(plotly)
ggplotly(p)

In the following section the running times for each runner are simulated assuming a Gamma distribution.

# parámetros de una gamma
alpha=sim2$mean^2/sim2$sd^2
beta=sim2$sd^2/sim2$mean

Create a function for the simulation.

simula_class = function(N=100,class=matrix(0,N,12),alpha,beta,p){
  for (i in 1:N){
  res1=c(0,12)
  for (j in 1:12){
    res1[j]=rnorm(1,sim2[j,]$mean,sim2[j,]$sd)
    a=rgamma(1,alpha[j],scale=beta[j])
    while(a<sim2$min[j]*p){
      a=rgamma(1,alpha[j],scale=beta[j])
    }
    res1[j]=a
  }
  
  class[i,]=sort(res1,index.return=TRUE)$ix
}
return(class=class)
}

The previous function is used to simulate.

class=simula_class(N=10000,alpha=alpha,beta=beta,p=1)
head(class)
##      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12]
## [1,]    5   12    2    8    4    7    3    6    9    10     1    11
## [2,]    4   11   12    5    8    3    2    7    6     1     9    10
## [3,]   12    4    5    7    2    1    8    6   11     3     9    10
## [4,]   12    2    6   11    8    9    3    7    5     4    10     1
## [5,]    5   12    8    2   11    4    1    7    6     9    10     3
## [6,]    4   12    5    8   10   11    2    7    3     6     9     1

For athletes, the probabilities of 1st, 2nd and 3rd are calculated:

N=dim(class)[1]
prob1=round(100*table(class[,1])/N,2)
filas=as.numeric(names(prob1))
prob_vic=c(0,12)
prob_vic[filas]=prob1

prob1=round(100*table(class[,2])/N,2)
filas=as.numeric(names(prob1))
prob_2=c(0,12)
prob_2[filas]=prob1

prob1=round(100*table(class[,3])/N,2)
filas=as.numeric(names(prob1))
prob_3=c(0,12)
prob_3[filas]=prob1
prob_pod=as.data.frame(cbind(prob_vic,prob_2,prob_3))
prob_pod=prob_pod %>% 
  rowwise() %>% 
  mutate(prob_pod = sum(prob_vic, prob_2,prob_3, na.rm = TRUE))

prob1=cbind(sim2,Prob_VIC=as.vector(prob_vic),Prob_POD=as.vector(prob_pod$prob_pod))
prob1=as.data.frame(prob1)
prob_final=prob1[order(prob1$Prob_VIC,decreasing=TRUE),]
prob_final
##                   Name     mean         sd      min Prob_VIC Prob_POD
## 4   Jakob Ingebrigtsen 3.496407 0.03664868 3.445500    34.24    64.09
## 5            Josh Kerr 3.496417 0.02630888 3.463167    25.90    68.77
## 12        Yared Nuguse 3.500095 0.02410718 3.463333    22.29    65.79
## 2          Cole Hocker 3.530352 0.05361465 3.460833     9.76    28.03
## 11   Timothy Cheruiyot 3.547271 0.06186217 3.478500     2.76    15.53
## 8          Niels Laros 3.515208 0.01555538 3.492333     2.72    34.76
## 1          Brian Komen 3.597205 0.07177189 3.480000     1.05     6.02
## 3        Hobbs Kessler 3.563937 0.05354333 3.490833     0.77     7.02
## 6  NORDAS Narve Gilje  3.559750 0.04339876 3.494667     0.43     5.82
## 7         Neil Gourley 3.548033 0.05041376 3.510833     0.05     2.58
## 10    Stefan Nillessen 3.601019 0.08012015 3.512500     0.02     0.77
## 9         Pietro Arese 3.593444 0.05817663 3.512333     0.01     0.82

It is possible to face questions such as: What is the probability that a runner, Kerr, will be ahead of another, “Ingebrigtsen”?

prob_Kerr_Ingebrigtsen=100*sum(which(class[1:N,]==5)<which(class[1:N,]==4))/N
prob_Kerr_Ingebrigtsen
## [1] 48.69

The exercise is repeated with \(p=1.01\) so that the slow runs of each runner are eliminated.

class=simula_class(N=10000,alpha=alpha,beta=beta,p=1.01)
head(class)
##      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12]
## [1,]    5   12    4    8    1    2   10    7    6     3    11     9
## [2,]    5    2    4   12    8    1   11    3    7     9     6    10
## [3,]   12    2    8    3    1    4    5   10    6    11     7     9
## [4,]    2    4    5   12    8    9   11    3    1     7     6    10
## [5,]    4    5   12    2    8    9    3   10    7     6    11     1
## [6,]    5   12    3    8    4   11    9   10    1     2     6     7

Once again, the probabilities of 1st, 2nd and 3rd are calculated:

N=dim(class)[1]
prob1=round(100*table(class[,1])/N,2)
filas=as.numeric(names(prob1))
prob_vic=c(0,12)
prob_vic[filas]=prob1

prob1=round(100*table(class[,2])/N,2)
filas=as.numeric(names(prob1))
prob_2=c(0,12)
prob_2[filas]=prob1

prob1=round(100*table(class[,3])/N,2)
filas=as.numeric(names(prob1))
prob_3=c(0,12)
prob_3[filas]=prob1
prob_pod=as.data.frame(cbind(prob_vic,prob_2,prob_3))
prob_pod=prob_pod %>% 
  rowwise() %>% 
  mutate(prob_pod = sum(prob_vic, prob_2,prob_3, na.rm = TRUE))

prob1=cbind(sim2,Prob_VIC=as.vector(prob_vic),Prob_POD=as.vector(prob_pod$prob_pod))
prob1=as.data.frame(prob1)

prob_rapido=prob1[order(prob1$Prob_VIC,decreasing=TRUE),]
prob_rapido
##                   Name     mean         sd      min Prob_VIC Prob_POD
## 4   Jakob Ingebrigtsen 3.496407 0.03664868 3.445500    43.34    73.83
## 12        Yared Nuguse 3.500095 0.02410718 3.463333    24.06    78.66
## 5            Josh Kerr 3.496417 0.02630888 3.463167    23.80    78.31
## 2          Cole Hocker 3.530352 0.05361465 3.460833     7.58    28.96
## 11   Timothy Cheruiyot 3.547271 0.06186217 3.478500     0.64    10.28
## 8          Niels Laros 3.515208 0.01555538 3.492333     0.38    21.79
## 1          Brian Komen 3.597205 0.07177189 3.480000     0.14     3.78
## 3        Hobbs Kessler 3.563937 0.05354333 3.490833     0.05     2.87
## 6  NORDAS Narve Gilje  3.559750 0.04339876 3.494667     0.01     1.45
## 7         Neil Gourley 3.548033 0.05041376 3.510833       NA     0.04
## 9         Pietro Arese 3.593444 0.05817663 3.512333       NA     0.02
## 10    Stefan Nillessen 3.601019 0.08012015 3.512500       NA     0.01

The results are plotted.

datos=merge(prob_final,prob_rapido,by="Name")
datos=datos[,-c(2,3,4,7,8,9)]
colnames(datos)=c("Name","Vic_final","Pod_final","Vic_rapida","Pod_rapida")
datos[is.na(datos)]=0
datos=datos[order(datos$Vic_final,decreasing=TRUE),]
rownames(datos)=seq(1:12)

datos$Name=as.factor(datos$Name)
datos$Name <- factor(datos$Name, levels = (datos$Name)[order(datos$Vic_final, decreasing = TRUE)])

fig <- plot_ly(datos, x = ~Name, y = ~Vic_final, type = 'bar', name = 'Nomal paced')
fig <- fig %>% add_trace(y = ~Vic_rapida, name = 'Fast paced')
fig <- fig %>% layout(yaxis = list(title = 'Probability'), barmode = 'group',xaxis=list(title = ' '))
fig <- fig %>% layout(title = list(text='Golden probabilities for the finalists',y=.975))

fig

The graph reflects the odds of winning the race for each rider in two different scenarios: when the race is fast and in a normal scenario.

datos$Name=as.factor(datos$Name)
datos$Name <- factor(datos$Name, levels = (datos$Name)[order(datos$Pod_final, decreasing = TRUE)])

fig <- plot_ly(datos, x = ~Name, y = ~Pod_final, type = 'bar', name = 'Nomal paced')
fig <- fig %>% add_trace(y = ~Pod_rapida, name = 'Fast paced')
fig <- fig %>% layout(yaxis = list(title = 'Probability'), barmode = 'group',xaxis=list(title = ' '))
fig <- fig %>% layout(title = list(text='Podium probabilities for the finalists',y=.975))

fig

The graph reflects the podium probabilities (finishing the race in the top three) for each rider in two different scenarios: when the race is fast and in a normal scenario.


Isaac